SSNbayes: Bayesian spatial stream networks.libPaths('C:\\1\\R')
library('Rcpp')
library('ggplot2')
library('dplyr')
library('reshape2')
library('spdep')
library('RColorBrewer')
library('ggrepel')
library('viridis')
library('SSN')
library('SSNdesign')
library('rstan')
library('ggmcmc')
library('StanHeaders')
library('knitr')
library('plotly')
In this tutorial fit a Bayesian tail-up exponential spatial stream network model that has been implemented using likelihood methods in the R package SSN (Hoef et al. 2014).
\[ y = X\beta + \varepsilon \]
For a pair of sites \(s_i\), \(s_j\) the tail-up covariance matrix is
\[C_{TU}(s_i,s_j|\theta) = \left\{\begin{matrix} 0 \:\:\: \textrm{if $s_i,s_j$ are flow unconnected} \\W_{ij} C_t \:\:\: \textrm{if $s_i,s_j$ are flow connected} \end{matrix}\right.\]
The matrix \(W_{ij}\) contains the spatial weights defined by the branching structure of the network.
The tail-up exponential is given by \(C_t\left (h| \theta \right ) = \sigma^2_{u}e^{-3h/\alpha}\)
where \(\sigma^2_{u}\) is the partial sill and \(\alpha\) is the range parameter.
\(Var(\varepsilon) = \Sigma\) where \(\Sigma = C_t \odot W_{ij} + \sigma^2_{NU}I\).
path <- "./bayes32.ssn"
set.seed(20200618)
n <- createSSN(
n = c(300),
obsDesign = poissonDesign(c(0.5)),
predDesign = poissonDesign(c(0.33)),
importToR = TRUE,
path = path,
treeFunction = iterativeTreeLayout
)
This stream network is composed by following observation points:
## [1] 151
and prediction locations:
## [1] 104
col <- brewer.pal(9,'YlGnBu')[6]
plot(
n,
lwdLineCol = "addfunccol",
lwdLineEx = 8,
lineCol = col,
col = 1,
pch = 16,
cex = 1,
xlab = "x-coordinate",
ylab = "y-coordinate"
)
plot(n, PredPointsID = "preds", add = T, pch = 15, cex = 1, col = "#E41A1C")
createDistMat(n, "preds", o.write=TRUE, amongpred = TRUE)
rawDFobs <- getSSNdata.frame(n, "Obs")
rawDFpred <- getSSNdata.frame(n, "preds")
# generating continous covariates
rawDFobs[,"X1"] <- rnorm(length(rawDFobs[,1]))
rawDFpred[,"X1"] <- rnorm(length(rawDFpred[,1]))
rawDFobs[,"X2"] <- rnorm(length(rawDFobs[,1]))
rawDFpred[,"X2"] <- rnorm(length(rawDFpred[,1]))
rawDFobs[,"X3"] <- rnorm(length(rawDFobs[,1]))
rawDFpred[,"X3"] <- rnorm(length(rawDFpred[,1]))
n <- putSSNdata.frame(rawDFobs,n, Name = 'Obs')
n <- putSSNdata.frame(rawDFpred,n, Name = 'preds')
Setting the parameters \(\sigma^2_{TU} = 3\), \(\alpha = 10\) and \(\sigma^2_{NU} = 0.1\). Using an intercept \(\beta_0= 10\) and slopes \(\beta_{1,2,3} = \{1,0,-1\}\).
set.seed(66)
sim.out <- SimulateOnSSN(
n,
ObsSimDF = rawDFobs,
PredSimDF = rawDFpred,
PredID = "preds",
formula = ~ X1 + X2 + X3,
coefficients = c(10, 1, 0, -1),
CorModels = c(
"Exponential.tailup"),
use.nugget = TRUE,
CorParms = c(3, 10,.1),
addfunccol = "addfunccol"
)
sim.out <- readRDS('sim.out_32.rds')
sim.ssn <- sim.out$ssn.object
simDFobs <- getSSNdata.frame(sim.ssn, "Obs")
simDFpred <- getSSNdata.frame(sim.ssn, "preds")
simpreds <- simDFpred[,"Sim_Values"]
simDFpred[,"Sim_Values"] <- NA
sim.ssn <- putSSNdata.frame(simDFpred, sim.ssn, "preds")
Fitting the model using SSN::glmssn() and obtaining the predictions.
The regression coefficients are well retrieved. The estimates of $^2_{TU} $ and \(\sigma^2_{NU}\) are quite precise, while \(\alpha\) is overestimated.
glmssn.out <- glmssn(Sim_Values ~ X1 + X2 + X3, sim.ssn,
CorModels = c("Exponential.tailup"),
addfunccol = "addfunccol")
summary(glmssn.out)
##
## Call:
## glmssn(formula = Sim_Values ~ X1 + X2 + X3, ssn.object = sim.ssn,
## CorModels = c("Exponential.tailup"), addfunccol = "addfunccol")
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6983 -1.4411 -0.0251 1.0826 3.8906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.31406 0.21018 49.07 <2e-16 ***
## X1 1.02921 0.07205 14.29 <2e-16 ***
## X2 -0.01472 0.06134 -0.24 0.811
## X3 -0.98576 0.08894 -11.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Covariance Parameters:
## Covariance.Model Parameter Estimate
## Exponential.tailup parsill 3.2647
## Exponential.tailup range 22.2954
## Nugget parsill 0.0999
##
## Residual standard error: 1.834283
## Generalized R-squared: 0.7048008
glmssn.pred <- predict(glmssn.out, "preds")
predDF <- getSSNdata.frame(glmssn.pred, "preds")
How good are the predictions?
ggplot() + geom_point(aes(x = simpreds,
y = predDF[, "Sim_Values"])) +
geom_abline(intercept = 0, slope = 1)+
xlab("True") +
ylab("Predicted") +
xlim(2.5,17.5)+
ylim(2.5,17.5)+
coord_fixed() +
theme_bw()
Creating the downstream hydrologic distances. Here the columns representing the FROM site and rows representing the TO site
createDistMat(n, "preds", o.write=TRUE, amongpred = TRUE)
do <- readRDS(paste0(path, '/distance/obs/dist.net1.RData')) # distance between observations
dp <- readRDS(paste0(path, '/distance/preds/dist.net1.RData')) # dist between pred points
dpo <- readRDS(paste0(path, '/distance/preds/dist.net1.a.RData')) #dist observation to prediction
dop <- readRDS(paste0(path, '/distance/preds/dist.net1.b.RData')) #dist prediction to observation
# matrix containing all downstream distances
D <- rbind(cbind(do, dpo), cbind(dop, dp))
# total distance
H <- D + base::t(D)
Creating data frames with the observations and predictions
obs_data <- getSSNdata.frame(sim.ssn, "Obs")
pred_data <- getSSNdata.frame(sim.ssn, "preds")
Obtaining the spatial weight matrix (W) of the observations and predictions based on the watershed area.
afv <- rbind(obs_data[c('pid', 'addfunccol')],
pred_data[c('pid', 'addfunccol')])
# codes adapated from from SSN::glmssn()
nsofar <- 0
dist.junc <- matrix(0, nrow = length(afv[, 1]), ncol = length(afv[,1]))
distmat <- D
ni <- length(distmat[1,])
ordpi <- order(as.numeric(rownames(distmat)))
dist.junc[(nsofar + 1):(nsofar + ni), (nsofar + 1):(nsofar + ni)] <-
distmat[ordpi, ordpi, drop = F]
b.mat <- pmin(dist.junc, base::t(dist.junc))
dist.hydro <- as.matrix(dist.junc + base::t(dist.junc))
flow.con.mat <- 1 - (b.mat > 0) * 1
n.all <- ni #NB
w.matrix <- sqrt(pmin(outer(afv[, 'addfunccol'],rep(1, times = n.all)),
base::t(outer(afv[, 'addfunccol'],rep(1, times = n.all) ))) /
pmax(outer(afv[, 'addfunccol'],rep(1, times = n.all)),
base::t(outer(afv[, 'addfunccol'], rep(1, times = n.all))))) *
flow.con.mat
We refer to this model as SSNbayes. Creating a list with the observations/predictions and covariates. Note that we use index arrays and slicing to include missing data in the prediction points dataset.
N <- nrow(obs_data) + nrow(pred_data)
data2 <- list(N = N, # obs + preds points
K = 4, # ncol of design matrix
y_obs = obs_data$Sim_Values, # y values in the obs df
N_y_obs = nrow(obs_data), # numb obs points
N_y_mis = nrow(pred_data), # numb preds points
i_y_obs = 1:nrow(obs_data), # index of obs points
i_y_mis = (nrow(obs_data)+1):(nrow(obs_data) + nrow(pred_data)), # index of preds points
X = rbind(cbind(1,obs_data[, c("X1", "X2", "X3")]), # obs + preds design matrix
cbind(1,pred_data[, c("X1", "X2", "X3")])),
h = H, # total stream distance
W = w.matrix, # spatial weights
I = diag(1, N, N) # diagonal matrix
)
We code the Stan model using the tail up exponential structure:
ssn_model2 <- "
data {
int<lower=1> N;
int<lower=1> K;
matrix[N,K] X;
int<lower = 0> N_y_obs; // number observed values
int<lower = 0> N_y_mis; // number missing values
int<lower = 1, upper = N_y_obs + N_y_mis> i_y_obs[N_y_obs]; // i index of observed
int<lower = 1, upper = N_y_obs + N_y_mis> i_y_mis[N_y_mis]; // i index of missing
vector[N_y_obs] y_obs; //
matrix[N, N] W ; // spatial weights
matrix[N, N] h ; // total hydrological dist
matrix[N, N] I ; // diag matrix
}
parameters {
real<lower=0> sigma_tu;
real<lower=0> sigma_nug;
vector[K] beta;
real<lower=0> alpha;
vector[N_y_mis] y_mis; //declaring the missing y
}
transformed parameters {
vector[N] y;
real<lower=0> var_tu; // parsil
real<lower=0> var_nug; // nugget
matrix[N, N] C_tu;
matrix[N, N] C1;
y[i_y_obs] = y_obs;
y[i_y_mis] = y_mis;
var_tu = sigma_tu ^ 2;
var_nug = sigma_nug ^ 2;
C1 = var_tu * exp(- 3 * h / alpha); // tail up exponential model
C_tu = C1 .* W; // Hadamard (element-wise) product
}
model {
matrix[N, N] L_Sigma = cholesky_decompose(C_tu + var_nug * I); // NB
target += multi_normal_cholesky_lpdf(y | X * beta, L_Sigma );
sigma_tu ~ cauchy(0,3); // prior on sd tail-up model
sigma_nug ~ cauchy(0,1); // prior on nugget effect
alpha ~ uniform(0,25); // NB: need to change this by uniform discrete with a log steps
}
"
We initialize using the paramaters estimates obtained using the SSN::glmssn() call made above:
ini <- function(){list(var_tu = glmssn.out$estimates$theta[1],
alpha = glmssn.out$estimates$theta[2],
var_nug = glmssn.out$estimates$theta[3]
)}
# this model should take approx 30 mins running
fit2 <- stan(model_code = ssn_model2,
model_name = "ssn_model2",
pars = c('var_tu', 'alpha', 'var_nug','beta', 'sigma_tu', 'sigma_nug', 'y'),
data = data2,
init = ini,
iter = 4000,
warmup = 2000,
thin = 2,
chains = 3,
verbose = F,
seed = 999,
refresh = max(5000/100, 1)
)
Parameter estimates and summary statistics:
stats <- summary(fit2)
stats <- stats$summary
knitr::kable(stats[1:7,])
| mean | se_mean | sd | 2.5% | 25% | 50% | 75% | 97.5% | n_eff | Rhat | |
|---|---|---|---|---|---|---|---|---|---|---|
| var_tu | 3.1755614 | 0.0142915 | 0.4735066 | 2.3651773 | 2.8383124 | 3.1385710 | 3.4596220 | 4.2508098 | 1097.7287 | 1.004049 |
| alpha | 18.8508490 | 0.1468276 | 3.9807292 | 10.2824640 | 15.9709579 | 19.2763195 | 22.1702152 | 24.7262010 | 735.0377 | 1.001912 |
| var_nug | 0.1068068 | 0.0021839 | 0.0570671 | 0.0286625 | 0.0653186 | 0.0954218 | 0.1366471 | 0.2391214 | 682.8048 | 1.009291 |
| beta[1] | 10.2949676 | 0.0060626 | 0.2110082 | 9.8726865 | 10.1531553 | 10.2959287 | 10.4387988 | 10.6980608 | 1211.3775 | 1.002170 |
| beta[2] | 1.0389127 | 0.0023540 | 0.0758226 | 0.8894299 | 0.9888071 | 1.0399870 | 1.0893453 | 1.1899727 | 1037.4866 | 1.000586 |
| beta[3] | -0.0130249 | 0.0024405 | 0.0666217 | -0.1437327 | -0.0562815 | -0.0132675 | 0.0315202 | 0.1156476 | 745.1964 | 1.010958 |
| beta[4] | -0.9880633 | 0.0031736 | 0.0926282 | -1.1785926 | -1.0466877 | -0.9873243 | -0.9259296 | -0.8175289 | 851.8705 | 1.002319 |
Comparing the model predictions with the latent value:
ypred <- data.frame(stats[grep("y\\[", row.names(stats)),][(nrow(obs_data)+1):(nrow(obs_data) + nrow(pred_data)),])
ypred$ytrue <- simpreds
ypred$ypred_glmssn <- predDF[, "Sim_Values"]
# lm() no spatial
lm_fit <- lm(Sim_Values ~ X1 + X2 + X3, obs_data)
lm_pred <- stats::predict(lm_fit, newdata = pred_data[,c('X1' , 'X2' , 'X3')])
ypred$ypred_lm <- lm_pred
ypred$ypred_ssnbayes <- ypred$mean
ypred$id <- as.numeric(regmatches(rownames(ypred), gregexpr("[[:digit:]]+", rownames(ypred))))
ypredm <- melt(ypred[, c("id", "ytrue", "ypred_lm", "ypred_glmssn", "ypred_ssnbayes")],
id.vars = c('id', "ytrue"))
Plotting the predictions obtained using the three different methods: lm(), glmssn() and SSNbayes.
p <- ggplot(ypredm) +
geom_errorbar(data = ypred, aes(x=ytrue, ymin=X97.5., ymax=X2.5.), col = 3, width=0.2, size=0.5, alpha = 0.25) +
geom_point(aes(x = ytrue, y = value, group = variable, col = variable)) +
stat_smooth(geom='line', method = 'lm', aes(x = ytrue, y = value, group = variable, col = variable), se = F, alpha = 0.7) +
geom_abline(intercept = 0, slope = 1)+
scale_color_manual(values = c(1,2,3))+
xlab("True") +
ylab("Predicted") +
xlim(2.5,17.5)+
ylim(2.5,17.5)+
coord_fixed() + theme_bw()
ggplotly(p)
Comparing the RMSE produced by the diffent models:
(rmse_lm <- sqrt(mean( ( (ypred$ytrue) - ypred$ypred_lm)^2))) # using lm()
## [1] 1.805427
(rmse_glmssn <- sqrt(mean( ( (ypred$ytrue) - ypred$ypred_glmssn)^2))) # using glmssn()
## [1] 1.424941
(rmse_ssnbayes <- sqrt(mean( ( (ypred$ytrue) - ypred$ypred_ssnbayes)^2))) # using SSNbayes()
## [1] 1.411547
the Bayesian approach produces similar predictions, fixed effects and spatial parameter estimates to those obtained using SSN::glmssn().
slightly better RMSE values are produced by SSNbayes.
implement other covariance structures and tail-down models
produce spatio temporal variations
add random effects
add more than one stream network
source("C:\\1\\DBDA2E-utilities2.R") # from: Kruschke, J. K. (2015). Doing Bayesian Data Analysis
\(\sigma^2_{TU}\)
diagMCMC( codaObject=As.mcmc.list(fit2,pars=c("var_tu","alpha","var_nug")), parName="var_tu" ) #
range \(\alpha\)
diagMCMC( codaObject=As.mcmc.list(fit2,pars=c("var_tu","alpha","var_nug")), parName="alpha" ) #
\(\sigma^2_{NU}\)
diagMCMC( codaObject=As.mcmc.list(fit2,pars=c("var_tu","alpha","var_nug")), parName="var_nug" )
Regression coefficients:
diagMCMC( codaObject=As.mcmc.list(fit2,pars="beta") , parName=c("beta[1]") )
diagMCMC( codaObject=As.mcmc.list(fit2,pars="beta") , parName=c("beta[2]") )
diagMCMC( codaObject=As.mcmc.list(fit2,pars="beta") , parName=c("beta[3]") )
diagMCMC( codaObject=As.mcmc.list(fit2,pars="beta") , parName=c("beta[4]") )
First prediction point \(y\):
diagMCMC( codaObject=As.mcmc.list(fit2,pars="y") , parName=c("y[152]") )
Hoef, Jay M. Ver, Erin E. Peterson, David Clifford, and Rohan Shah. 2014. “SSN: An R Package for Spatial Statistical Modeling on Stream Networks.” Journal of Statistical Software 56 (3): 1–45. http://www.jstatsoft.org/v56/i03/.